
import pandas as pd
import numpy as np
import jieba
import re
from sklearn.model_selection import train_test_split
# from gensim.models.word2vec import Word2Vec
from nlu_model.ptm.word2vector import Word2vector
from nlu_model.util.pkl_impl import save_pkl, load_pkl
from nlu_model.sim.model_pytorch.model_train import TrainModelPipeline


def load_data(path):
    """Load tab-separated sentence-pair data from *path*.

    Each line is expected to have at least 4 tab-separated fields:
    ``id \t sentence_1 \t sentence_2 \t label`` (ATEC similarity format).

    Args:
        path: Path to the TSV/CSV file.

    Returns:
        A tuple ``(sentence, label)`` where ``sentence`` is a list of
        ``[sentence_1, sentence_2]`` pairs and ``label`` is a list of
        ``-1`` (label < 1) or ``1`` (label >= 1) ints.
    """
    sentence = []
    label = []
    # Explicit encoding: the dataset is Chinese text; relying on the
    # locale default encoding breaks on non-UTF-8 systems.
    with open(path, encoding="utf-8") as f:
        for line in f:
            ll = line.strip().split("\t")
            # Column 0 is the row id and is intentionally skipped.
            sentence.append([ll[1], ll[2]])
            # Binarize: any positive label -> 1, otherwise -1.
            label.append(-1 if int(ll[3]) < 1 else 1)
    return sentence, label

def single2double(sentence):
    """Split a list of ``[a, b]`` sentence pairs into two parallel lists.

    Args:
        sentence: Iterable of 2-element sequences.

    Returns:
        A tuple ``(firsts, seconds)`` holding, respectively, the first and
        second element of every pair, in the original order.
    """
    firsts = [pair[0] for pair in sentence]
    seconds = [pair[1] for pair in sentence]
    return firsts, seconds

# --- Training driver (runs at import time) ---------------------------------
# Loads the ATEC sentence-pair dataset, splits it, converts text to word-id
# sequences via a pretrained word2vec vocabulary, and trains/evaluates a
# similarity model.

# Load [sentence_1, sentence_2] pairs and -1/1 labels from the ATEC TSV file.
sentence, label = load_data("./data/sim/atec_sim/atec_nlp_sim_train_all.csv")

# 80/20 train/test split; fixed seed for reproducibility.
x_train, x_test, y_train, y_test = train_test_split(sentence, label, test_size=0.2,random_state=666)

# Unzip each pair list into two parallel sentence lists (left/right inputs).
x_train_1, x_train_2 = single2double(x_train)
x_test_1, x_test_2 = single2double(x_test)

word2vector = Word2vector()

# Load pretrained word2vec artifacts: word->index map, embedding matrix,
# and model config. NOTE(review): these pickles were trained on shopping
# reviews, not the ATEC finance domain — confirm vocabulary coverage.
word2vector.load("./data/ptm/shopping_reviews/w2v_word2idx2020100601.pkl",
                 "./data/ptm/shopping_reviews/w2v_model_metric_2020100601.pkl", 
                 "./data/ptm/shopping_reviews/w2v_model_conf_2020100601.pkl")

# TextCNN hyperparameters. `pre_word_embeds` injects the pretrained
# embedding weights; `embed_dim` (300) presumably matches their width —
# TODO confirm against the word2vec config.
textCNN_param = {
    'vocab_size': len(word2vector.word2idx_dic),
    'embed_dim': 300,
    'class_num': 2,
    "kernel_num": 16,
    "kernel_size": [1, 2, 3, 4, 5],
    "dropout": 0.5,
    "pre_word_embeds": word2vector.embedding_weights,
    "output_vec":200
}

train_config = {
    "MODEL_CONF": textCNN_param,
    "batch_size": 64,
    "epoch":3
}

# Tokenize with jieba and map tokens to ids, padding/truncating each
# sentence to a fixed length of 20 (batch2idx semantics assumed — verify).
print(x_train_1[0],x_train_2[0])
x_train_1 = word2vector.batch2idx([list(jieba.cut(i)) for i  in x_train_1], batch_len = 20)
x_train_2 = word2vector.batch2idx([list(jieba.cut(i)) for i  in x_train_2], batch_len = 20)
print(x_train_1[0],x_train_2[0])
x_test_1 = word2vector.batch2idx([list(jieba.cut(i)) for i  in x_test_1], batch_len = 20)
x_test_2 = word2vector.batch2idx([list(jieba.cut(i)) for i  in x_test_2], batch_len = 20)

# Train on the paired id sequences, then evaluate on the held-out split.
train_model_pipeline = TrainModelPipeline(train_config)
train_model_pipeline.call_train(x_train_1, x_train_2, y_train)
train_model_pipeline.call_evaluate(x_test_1, x_test_2, y_test)

